bitkeeper revision 1.1267 (4251679dHqcry__n_OW9aNhfUtjZ_A)
author mafetter@fleming.research <mafetter@fleming.research>
Mon, 4 Apr 2005 16:13:17 +0000 (16:13 +0000)
committer mafetter@fleming.research <mafetter@fleming.research>
Mon, 4 Apr 2005 16:13:17 +0000 (16:13 +0000)
Hand merged

Signed-off-by: michael.fetterman@cl.cam.ac.uk
19 files changed:
1  2 
.rootkeys
BitKeeper/etc/ignore
tools/libxc/xc.h
tools/libxc/xc_plan9_build.c
tools/misc/Makefile
xen/arch/x86/domain.c
xen/arch/x86/domain_build.c
xen/arch/x86/mm.c
xen/arch/x86/shadow.c
xen/arch/x86/traps.c
xen/arch/x86/vmx.c
xen/arch/x86/vmx_io.c
xen/common/grant_table.c
xen/common/page_alloc.c
xen/common/schedule.c
xen/drivers/char/console.c
xen/include/asm-x86/mm.h
xen/include/asm-x86/page.h
xen/include/xen/sched.h

diff --cc .rootkeys
index 9dcab0339696ea1ea0a1616f3569e980be79c944,1db0f788e9d1bd407065347f88445a66c7f5973f..558cd95367ca0e02677a41ba29e96c291e15f441
+++ b/.rootkeys
  41a216cayFe2FQroFuzvNPw1AvNiqQ tools/libxutil/util.c
  41a216ca7mgVSnCBHPCLkGOIqPS1CQ tools/libxutil/util.h
  3f776bd2Xd-dUcPKlPN2vG89VGtfvQ tools/misc/Makefile
 +4225f56d7sa9aEARfjNeCVTMYDAmZA tools/misc/cpuperf/Makefile
 +4225f56dS5TGdKojmuBnrV3PzbE6Rg tools/misc/cpuperf/README.txt
 +4225f56dcodvBSPoWYS6kvwZCQhgzg tools/misc/cpuperf/cpuperf.c
 +4225f56dMjZK14EWd8K0gq4v5Diwjg tools/misc/cpuperf/cpuperf_perfcntr.h
 +4225f56d_XjSY1297IiH96qeqD4sCA tools/misc/cpuperf/cpuperf_xeno.h
 +4225f56dqlGC_UZ681F95mCgLbOeHQ tools/misc/cpuperf/module/Makefile
 +4225f56dnmms-VFr1MiDVG_dYoM7IQ tools/misc/cpuperf/module/perfcntr.c
 +4225f56dYhIGQRD_kKVJ6xQrkqO0YQ tools/misc/cpuperf/p4perf.h
  40ab2cfawIw8tsYo0dQKtp83h4qfTQ tools/misc/fakei386xen
+ 4249273cDOw6_uLUPvvUwWU1ZrJxnQ tools/misc/mbootpack/GPL
+ 4249273cSmj2h8Fj3UpTg0g-k6CLsA tools/misc/mbootpack/Makefile
+ 4249273c8gKIttF1QPiczvGo5AEOeA tools/misc/mbootpack/README
+ 4249273c4N4PAkvt3trNlto4h76k8A tools/misc/mbootpack/bin2c.c
+ 4249273cISg5nhW1Pt7OJ0jFu343ig tools/misc/mbootpack/bootsect.S
+ 4249273cUiz8CgLqnG7XYFa8x5-MoQ tools/misc/mbootpack/buildimage.c
+ 4249273c_gZ2yI_h-ci66E1Y5oSEPA tools/misc/mbootpack/mb_header.h
+ 4249273cWnlW0-lOIYua1bkKirn6vA tools/misc/mbootpack/mb_info.h
+ 4249273cA8LI3IMaSuhLOjykuMeQJA tools/misc/mbootpack/mbootpack.c
+ 4249273cVTgyv2HYd-mC29IDaz0-mg tools/misc/mbootpack/mbootpack.h
+ 4249273cLXQbRWFp_v-FqcyOm0sYtg tools/misc/mbootpack/setup.S
  3f6dc136ZKOjd8PIqLbFBl_v-rnkGg tools/misc/miniterm/Makefile
  3f6dc140C8tAeBfroAF24VrmCS4v_w tools/misc/miniterm/README
  3f6dc142IHaf6XIcAYGmhV9nNSIHFQ tools/misc/miniterm/miniterm.c
index 0817a4ce7f214867eaa7a177325d9f2c9632abfb,1596669450fb7b172527e907421ae2f29ae3bd03..539e97e1b14853b63b8dfffc9fc1e2b723310ae1
@@@ -90,10 -91,12 +91,15 @@@ tools/cmdline/
  tools/cmdline/xen/*
  tools/ioemu/iodev/device-model
  tools/libxc/xen/*
 +tools/misc/cpuperf/cpuperf-perfcntr
 +tools/misc/cpuperf/cpuperf-xen
+ tools/misc/mbootpack/bin2c
+ tools/misc/mbootpack/bootsect
+ tools/misc/mbootpack/bzimage_header.c
+ tools/misc/mbootpack/mbootpack
+ tools/misc/mbootpack/setup
  tools/misc/miniterm/miniterm
 +tools/misc/xc_shadow
  tools/misc/xen_cpuperf
  tools/misc/xenperf
  tools/tests/test_x86_emulator
Simple merge
index c6778d44bc76ce4fc3b15ecaec69b1d83cec038b,3476136196551b86606776d029646769ecb254c2..3476136196551b86606776d029646769ecb254c2
mode 100644,100755..100644
index a1e0165e6a553fac3646158f9e137f65228073cb,f12193eb6e230a32e277bf373272abc1af4453b9..43e68949c24314c337a37404b0c58686d7f45de9
@@@ -21,21 -21,22 +21,25 @@@ INSTALL_SBIN = netfix xm xend xensv xen
  all: build
  build: $(TARGETS)
        $(MAKE) -C miniterm
 +      $(MAKE) -C cpuperf
+       $(MAKE) -C mbootpack
  
  install: build
        [ -d $(DESTDIR)/usr/bin ] || $(INSTALL_DIR) $(DESTDIR)/usr/bin
        [ -d $(DESTDIR)/usr/sbin ] || $(INSTALL_DIR) $(DESTDIR)/usr/sbin
        $(INSTALL_PROG) $(INSTALL_BIN) $(DESTDIR)/usr/bin
        $(INSTALL_PROG) $(INSTALL_SBIN) $(DESTDIR)/usr/sbin
 +      $(MAKE) -C cpuperf install
  #       No sense in installing miniterm on the Xen box.
  #     $(MAKE) -C miniterm install
+ #       Likewise mbootpack
+ #     $(MAKE) -C mbootpack install
  
  clean:
        $(RM) *.o $(TARGETS) *~
        $(MAKE) -C miniterm clean
 +      $(MAKE) -C cpuperf clean
+       $(MAKE) -C mbootpack clean
  
  %.o: %.c $(HDRS) Makefile
        $(CC) -c $(CFLAGS) -o $@ $<
index d06e9e7f821a4e484239133712c6f2309c1fb7c2,efcc269f9b7f3b17d16e458b3cf749a3aa7aaea4..1eef780c98d951d1e3215089b7f3d46e4f63e3b8
@@@ -427,9 -499,17 +436,17 @@@ int arch_set_info_guest
      ed->arch.event_address     = c->event_callback_eip;
      ed->arch.failsafe_selector = c->failsafe_callback_cs;
      ed->arch.failsafe_address  = c->failsafe_callback_eip;
+ #elif defined(__x86_64__)
+     ed->arch.event_address     = c->event_callback_eip;
+     ed->arch.failsafe_address  = c->failsafe_callback_eip;
+     ed->arch.syscall_address   = c->syscall_callback_eip;
+ #endif
+     if ( ed->eid == 0 )
+         d->vm_assist = c->vm_assist;
  
      phys_basetab = c->pt_base;
 -    ed->arch.guest_table = ed->arch.phys_table = mk_pagetable(phys_basetab);
 +    ed->arch.guest_table = mk_pagetable(phys_basetab);
  
      if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d, 
                              PGT_base_page_table) )
Simple merge
index 04c3b61e23617e5c74e7334ab8dcb033ff403bb7,aca041e0c6e62a01f7c802d41116d9b8f3ca3997..cf49117025d103d8c893df61c0fdaf22619f1d84
  #define MEM_LOG(_f, _a...) ((void)0)
  #endif
  
 -static int alloc_l2_table(struct pfn_info *page);
 -static int alloc_l1_table(struct pfn_info *page);
 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 
 -                                         u32 type,
 -                                         struct domain *d);
 -
+ /*
+  * Both do_mmuext_op() and do_mmu_update():
+  * We steal the m.s.b. of the @count parameter to indicate whether this
+  * invocation of do_mmu_update() is resuming a previously preempted call.
+  */
+ #define MMU_UPDATE_PREEMPTED          (~(~0U>>1))
  static void free_l2_table(struct pfn_info *page);
  static void free_l1_table(struct pfn_info *page);
  
@@@ -1198,16 -1170,6 +1199,13 @@@ int get_page_type(struct pfn_info *page
                      nx |= PGT_validated;
              }
          }
-             {
-                 rep_nop();
-                 barrier();
-             }
 +        else if ( unlikely(!(x & PGT_validated)) )
 +        {
 +            /* Someone else is updating validation of this page. Wait... */
 +            while ( (y = page->u.inuse.type_info) == x )
++                cpu_relax();
 +            goto again;
 +        }
          else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
          {
              if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
@@@ -1304,36 -1256,142 +1303,146 @@@ int new_guest_cr3(unsigned long mfn
      return okay;
  }
  
- static int do_extended_command(unsigned long ptr, unsigned long val)
+ static void process_deferred_ops(unsigned int cpu)
+ {
+     unsigned int deferred_ops;
+     deferred_ops = percpu_info[cpu].deferred_ops;
+     percpu_info[cpu].deferred_ops = 0;
+     if ( deferred_ops & DOP_FLUSH_TLB )
++    {
++        if ( shadow_mode_enabled(d) )
++            shadow_sync_all(d);
+         local_flush_tlb();
++    }
+         
+     if ( deferred_ops & DOP_RELOAD_LDT )
+         (void)map_ldt_shadow_page(0);
+     if ( unlikely(percpu_info[cpu].foreign != NULL) )
+     {
+         put_domain(percpu_info[cpu].foreign);
+         percpu_info[cpu].foreign = NULL;
+     }
+ }
+ static int set_foreigndom(unsigned int cpu, domid_t domid)
+ {
+     struct domain *e, *d = current->domain;
+     int okay = 1;
+     if ( (e = percpu_info[cpu].foreign) != NULL )
+         put_domain(e);
+     percpu_info[cpu].foreign = NULL;
+     
+     if ( domid == DOMID_SELF )
+         goto out;
+     if ( !IS_PRIV(d) )
+     {
+         switch ( domid )
+         {
+         case DOMID_IO:
+             get_knownalive_domain(dom_io);
+             percpu_info[cpu].foreign = dom_io;
+             break;
+         default:
+             MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
+             okay = 0;
+             break;
+         }
+     }
+     else
+     {
+         percpu_info[cpu].foreign = e = find_domain_by_id(domid);
+         if ( e == NULL )
+         {
+             switch ( domid )
+             {
+             case DOMID_XEN:
+                 get_knownalive_domain(dom_xen);
+                 percpu_info[cpu].foreign = dom_xen;
+                 break;
+             case DOMID_IO:
+                 get_knownalive_domain(dom_io);
+                 percpu_info[cpu].foreign = dom_io;
+                 break;
+             default:
+                 MEM_LOG("Unknown domain '%u'", domid);
+                 okay = 0;
+                 break;
+             }
+         }
+     }
+  out:
+     return okay;
+ }
+ static inline unsigned long vcpuset_to_pcpuset(
+     struct domain *d, unsigned long vset)
+ {
+     unsigned int  vcpu;
+     unsigned long pset = 0;
+     struct exec_domain *ed;
+     while ( vset != 0 )
+     {
+         vcpu = find_first_set_bit(vset);
+         vset &= ~(1UL << vcpu);
+         if ( (vcpu < MAX_VIRT_CPUS) &&
+              ((ed = d->exec_domain[vcpu]) != NULL) )
+             pset |= 1UL << ed->processor;
+     }
+     return pset;
+ }
+ int do_mmuext_op(
+     struct mmuext_op *uops,
+     unsigned int count,
+     unsigned int *pdone,
+     unsigned int foreigndom)
  {
-     int okay = 1, cpu = smp_processor_id();
-     unsigned int cmd = val & MMUEXT_CMD_MASK, type;
+     struct mmuext_op op;
+     int rc = 0, i = 0, okay, cpu = smp_processor_id();
+     unsigned int type, done = 0;
+     struct pfn_info *page;
      struct exec_domain *ed = current;
      struct domain *d = ed->domain, *e;
-     unsigned long mfn = ptr >> PAGE_SHIFT;
-     struct pfn_info *page = &frame_table[mfn];
      u32 x, y, _d, _nd;
-     domid_t domid;
-     grant_ref_t gntref;
  
-     switch ( cmd )
+     LOCK_BIGLOCK(d);
+     cleanup_writable_pagetable(d);
+     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
      {
-     case MMUEXT_PIN_L1_TABLE:
-         /*
-          * We insist that, if you pin an L1 page, it's the first thing that
-          * you do to it. This is because we require the backptr to still be
-          * mutable. This assumption seems safe.
-          */
-         type = PGT_l1_page_table | PGT_va_mutable;
+         count &= ~MMU_UPDATE_PREEMPTED;
+         if ( unlikely(pdone != NULL) )
+             (void)get_user(done, pdone);
+     }
+     if ( !set_foreigndom(cpu, foreigndom) )
+     {
+         rc = -EINVAL;
+         goto out;
+     }
  
-     pin_page:
-         if ( shadow_mode_enabled(FOREIGNDOM) )
-             type = PGT_writable_page;
+     if ( unlikely(!array_access_ok(VERIFY_READ, uops, count, sizeof(op))) )
+     {
+         rc = -EFAULT;
+         goto out;
+     }
  
-         okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
-         if ( unlikely(!okay) )
+     for ( i = 0; i < count; i++ )
+     {
+         if ( hypercall_preempt_check() )
          {
-             MEM_LOG("Error while pinning mfn %p", mfn);
+             rc = hypercall4_create_continuation(
+                 __HYPERVISOR_mmuext_op, uops,
+                 (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
              break;
          }
  
@@@ -1728,33 -1696,22 +1745,22 @@@ int do_mmu_update
      cleanup_writable_pagetable(d);
  
      if ( unlikely(shadow_mode_enabled(d)) )
 -        check_pagetable(d, ed->arch.guest_table, "pre-mmu"); /* debug */
 +        check_pagetable(ed, "pre-mmu"); /* debug */
  
-     /*
-      * If we are resuming after preemption, read how much work we have already
-      * done. This allows us to set the @done output parameter correctly.
-      * We also reset FOREIGNDOM here.
-      */
-     if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
+     if ( unlikely(shadow_mode_translate(d)) )
+         domain_crash_synchronous();
+     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
      {
-         if ( !(count & MMU_UPDATE_PREEMPTED) )
-         {
-             /* Count overflow into private FOREIGNDOM field. */
-             MEM_LOG("do_mmu_update count is too large");
-             rc = -EINVAL;
-             goto out;
-         }
          count &= ~MMU_UPDATE_PREEMPTED;
-         domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
-         count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
          if ( unlikely(pdone != NULL) )
              (void)get_user(done, pdone);
-         if ( (domid != current->domain->id) &&
-              !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
-         {
-             rc = -EINVAL;
-             goto out;
-         }
+     }
+     if ( !set_foreigndom(cpu, foreigndom) )
+     {
+         rc = -EINVAL;
+         goto out;
      }
  
      perfc_incrc(calls_to_mmu_update); 
              okay = 1;
  
              /*
 -             * If in log-dirty mode, mark the corresponding pseudo-physical
 +             * If in log-dirty mode, mark the corresponding
               * page as dirty.
               */
 -            if ( unlikely(shadow_mode_log_dirty(d)) && 
 -                 mark_dirty(d, pfn) )
 -                d->arch.shadow_dirty_block_count++;
 +            if ( unlikely(shadow_mode_log_dirty(FOREIGNDOM)) &&
 +                 mark_dirty(FOREIGNDOM, mfn) )
 +                FOREIGNDOM->arch.shadow_dirty_block_count++;
  
 -            put_page(&frame_table[pfn]);
 +            put_page(&frame_table[mfn]);
              break;
  
-             /*
-              * MMU_EXTENDED_COMMAND: Extended command is specified
-              * in the least-siginificant bits of the 'value' field.
-              */
-         case MMU_EXTENDED_COMMAND:
-             req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
-             okay = do_extended_command(req.ptr, req.val);
-             break;
          default:
              MEM_LOG("Invalid page update command %p", req.ptr);
              break;
      }
  
   out:
 -    if ( prev_pfn != 0 )
 +    if ( prev_mfn != 0 )
          unmap_domain_mem((void *)va);
  
-     deferred_ops = percpu_info[cpu].deferred_ops;
-     percpu_info[cpu].deferred_ops = 0;
-     if ( deferred_ops & DOP_FLUSH_TLB )
-     {
-         if ( shadow_mode_enabled(d) )
-             shadow_sync_all(d);
-         local_flush_tlb();
-     }
-         
-     if ( deferred_ops & DOP_RELOAD_LDT )
-         (void)map_ldt_shadow_page(0);
 -    if ( unlikely(prev_spl1e != 0) ) 
 -        unmap_domain_mem((void *)prev_spl1e);
--
-     if ( unlikely(percpu_info[cpu].foreign != NULL) )
-     {
-         put_domain(percpu_info[cpu].foreign);
-         percpu_info[cpu].foreign = NULL;
-     }
+     process_deferred_ops(cpu);
  
      /* Add incremental work we have done to the @done output parameter. */
      if ( unlikely(pdone != NULL) )
@@@ -2113,48 -2031,53 +2091,72 @@@ int do_update_va_mapping(unsigned long 
  
      cleanup_writable_pagetable(d);
  
-     /*
-      * XXX When we make this support 4MB superpages we should also deal with 
-      * the case of updating L2 entries.
-      */
 -    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
 -                                mk_l1_pgentry(val))) )
 -        rc = -EINVAL;
 -
      if ( unlikely(shadow_mode_enabled(d)) )
 -        update_shadow_va_mapping(va, val, ed, d);
 +    {
 +        if ( unlikely(percpu_info[cpu].foreign &&
 +                      (shadow_mode_translate(d) ||
 +                       shadow_mode_translate(percpu_info[cpu].foreign))) )
 +        {
 +            // The foreign domain's pfn's are in a different namespace.
 +            // There's not enough information in just a gpte to figure out
 +            // how to (re-)shadow this entry.
 +            //
 +            domain_crash();
 +        }
 +    
 +        rc = update_shadow_va_mapping(va, val, ed, d);
 +    }
 +    else if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
 +                                     mk_l1_pgentry(val))) )
 +        rc = -EINVAL;
  
-     deferred_ops = percpu_info[cpu].deferred_ops;
-     percpu_info[cpu].deferred_ops = 0;
-     if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
-          unlikely(flags & UVMF_FLUSH_TLB) )
-     {
-         if ( unlikely(shadow_mode_enabled(d)) )
-             shadow_sync_all(d);
-         local_flush_tlb();
-     }
-     else if ( unlikely(flags & UVMF_INVLPG) )
+     switch ( flags & UVMF_FLUSHTYPE_MASK )
      {
-         if ( unlikely(shadow_mode_enabled(d)) )
-             shadow_invlpg(current, va);
-         __flush_tlb_one(va);
+     case UVMF_TLB_FLUSH:
+         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
+         {
+         case UVMF_LOCAL:
++            if ( unlikely(shadow_mode_enabled(d)) )
++                shadow_sync_all(d);
+             local_flush_tlb();
+             break;
+         case UVMF_ALL:
++            BUG_ON(shadow_mode_enabled(d));
+             flush_tlb_mask(d->cpuset);
+             break;
+         default:
+             if ( unlikely(get_user(vset, (unsigned long *)bmap_ptr)) )
+                 rc = -EFAULT;
+             pset = vcpuset_to_pcpuset(d, vset);
+             flush_tlb_mask(pset & d->cpuset);
+             break;
+         }
+         break;
+     case UVMF_INVLPG:
+         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
+         {
+         case UVMF_LOCAL:
++            if ( unlikely(shadow_mode_enabled(d)) )
++                shadow_invlpg(current, va);
+             local_flush_tlb_one(va);
+             break;
+         case UVMF_ALL:
++            BUG_ON(shadow_mode_enabled(d));
+             flush_tlb_one_mask(d->cpuset, va);
+             break;
+         default:
+             if ( unlikely(get_user(vset, (unsigned long *)bmap_ptr)) )
+                 rc = -EFAULT;
+             pset = vcpuset_to_pcpuset(d, vset);
++            BUG_ON(shadow_mode_enabled(d) && (pset != (1<<cpu)));
+             flush_tlb_one_mask(pset & d->cpuset, va);
+             break;
+         }
+         break;
      }
  
-     if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
-         (void)map_ldt_shadow_page(0);
+     process_deferred_ops(cpu);
      
      UNLOCK_BIGLOCK(d);
  
@@@ -2300,24 -2218,20 +2299,22 @@@ long do_set_gdt(unsigned long *frame_li
  }
  
  
- long do_update_descriptor(
-     unsigned long pa, unsigned long word1, unsigned long word2)
+ long do_update_descriptor(unsigned long pa, u64 desc)
  {
 -    unsigned long pfn = pa >> PAGE_SHIFT;
 +    struct domain *dom = current->domain;
 +    unsigned long gpfn = pa >> PAGE_SHIFT;
 +    unsigned long mfn;
      struct desc_struct *gdt_pent, d;
      struct pfn_info *page;
      struct exec_domain *ed;
      long ret = -EINVAL;
  
-     d.a = (u32)word1;
-     d.b = (u32)word2;
+     *(u64 *)&d = desc;
  
 -    LOCK_BIGLOCK(current->domain);
 +    LOCK_BIGLOCK(dom);
  
 -    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
 -        UNLOCK_BIGLOCK(current->domain);
 +    if ( !VALID_MFN(mfn = __gpfn_to_mfn(dom, gpfn)) ) {
 +        UNLOCK_BIGLOCK(dom);
          return -EINVAL;
      }
  
@@@ -2427,10 -2319,10 +2424,10 @@@ void ptwr_flush(const int which
  
      if ( unlikely(__get_user(pte, ptep)) )
      {
 -        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
 +        MEM_LOG("ptwr: Could not read pte at %p", ptep);
          /*
           * Really a bug. We could read this PTE during the initial fault,
-          * and pagetables can't have changed meantime. XXX Multi-CPU guests?
+          * and pagetables can't have changed meantime.
           */
          BUG();
      }
      /* Write-protect the p.t. page in the guest page table. */
      if ( unlikely(__put_user(pte, ptep)) )
      {
 -        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
 +        MEM_LOG("ptwr: Could not update pte at %p", ptep);
          /*
           * Really a bug. We could write this PTE during the initial fault,
-          * and pagetables can't have changed meantime. XXX Multi-CPU guests?
+          * and pagetables can't have changed meantime.
           */
          BUG();
      }
               */
              memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
                     (L1_PAGETABLE_ENTRIES - i) * sizeof(l1_pgentry_t));
-             unmap_domain_mem(pl1e);
-             ptwr_info[cpu].ptinfo[which].l1va = 0;
              domain_crash();
+             break;
          }
          
 -        if ( unlikely(sl1e != NULL) )
 -            l1pte_propagate_from_guest(
 -                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
 -
          put_page_from_l1e(ol1e, d);
      }
      unmap_domain_mem(pl1e);
@@@ -2754,10 -2671,11 +2750,10 @@@ int ptwr_do_page_fault(unsigned long ad
      ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
      
      /* For safety, disconnect the L1 p.t. page from current space. */
 -    if ( (which == PTWR_PT_ACTIVE) && 
 -         likely(!shadow_mode_enabled(ed->domain)) )
 +    if ( which == PTWR_PT_ACTIVE )
      {
          *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
-         flush_tlb(); /* XXX Multi-CPU guests? */
+         local_flush_tlb(); /* XXX Multi-CPU guests? */
      }
      
      /* Temporarily map the L1 page, and make a copy of it. */
@@@ -2814,43 -2733,556 +2811,96 @@@ __initcall(ptwr_init)
  /************************************************************************/
  /************************************************************************/
  
--#ifndef NDEBUG
 -
 -void audit_pagelist(struct domain *d)
 -{
 -    struct list_head *list_ent;
 -    int xenpages, totpages;
 -
 -    list_ent = d->xenpage_list.next;
 -    for ( xenpages = 0; (list_ent != &d->xenpage_list); xenpages++ )
 -    {
 -        list_ent = list_ent->next;
 -    }
 -    list_ent = d->page_list.next;
 -    for ( totpages = 0; (list_ent != &d->page_list); totpages++ )
 -    {
 -        list_ent = list_ent->next;
 -    }
 -
 -    if ( xenpages != d->xenheap_pages ||
 -         totpages != d->tot_pages )
 -    {
 -        printk("ARGH! dom %d: xen=%d %d, pages=%d %d\n",
 -               xenpages, d->xenheap_pages, 
 -               totpages, d->tot_pages );
 -    }
 -}
 -
 -void audit_domain(struct domain *d)
 -{
 -    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
 -
 -    void adjust (struct pfn_info *page, int dir, int adjtype)
 -    {
 -        int count = page->count_info & PGC_count_mask;
 -
 -        if ( adjtype )
 -        {
 -            int tcount = page->u.inuse.type_info & PGT_count_mask;
 -            
 -            ttot++;
 -
 -            tcount += dir;
 -
 -            if ( tcount < 0 )
 -            {
 -                /* This will only come out once. */
 -                printk("Audit %d: type count whent below zero pfn=%x "
 -                       "taf=%x otaf=%x\n",
 -                       d->id, page-frame_table,
 -                       page->u.inuse.type_info,
 -                       page->tlbflush_timestamp);
 -            }
 -            
 -            page->u.inuse.type_info =
 -                (page->u.inuse.type_info & ~PGT_count_mask) | 
 -                (tcount & PGT_count_mask);
 -        }
 -
 -        ctot++;
 -        count += dir;
 -        if ( count < 0 )
 -        {
 -            /* This will only come out once. */
 -            printk("Audit %d: general count whent below zero pfn=%x "
 -                   "taf=%x otaf=%x\n",
 -                   d->id, page-frame_table,
 -                   page->u.inuse.type_info,
 -                   page->tlbflush_timestamp);
 -        }
 -            
 -        page->count_info =
 -            (page->count_info & ~PGC_count_mask) | 
 -            (count & PGC_count_mask);            
 -
 -    }
 -
 -    void scan_for_pfn(struct domain *d, unsigned long xpfn)
 -    {
 -        unsigned long pfn, *pt;
 -        struct list_head *list_ent;
 -        struct pfn_info *page;
 -        int i;
 -
 -        list_ent = d->page_list.next;
 -        for ( i = 0; (list_ent != &d->page_list); i++ )
 -        {
 -            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 -            page = &frame_table[pfn];
 -            
 -            switch ( page->u.inuse.type_info & PGT_type_mask )
 -            {
 -            case PGT_l1_page_table:
 -            case PGT_l2_page_table:
 -                pt = map_domain_mem(pfn<<PAGE_SHIFT);
 -                for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
 -                    if ( (pt[i] & _PAGE_PRESENT) &&
 -                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
 -                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
 -                               d->id, i, pfn, page->u.inuse.type_info,
 -                               page->count_info);
 -                unmap_domain_mem(pt);           
 -            }
 -
 -            list_ent = frame_table[pfn].list.next;
 -        }
 -
 -    }
 -
 -    void scan_for_pfn_remote(unsigned long xpfn)
 -    {
 -        struct domain *e;
 -        for_each_domain ( e )
 -            scan_for_pfn( e, xpfn );            
 -    }   
 -
 -    int i, l1, l2;
 -    unsigned long pfn;
 -    struct list_head *list_ent;
 -    struct pfn_info *page;
 -
 -    if ( d != current->domain )
 -        domain_pause(d);
 -
 -    sync_lazy_execstate_all();
 -
 -    printk("pt base=%lx sh_info=%x\n",
 -           pagetable_val(d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT,
 -           virt_to_page(d->shared_info)-frame_table);
 -           
 -    spin_lock(&d->page_alloc_lock);
 -
 -    audit_pagelist(d);
 -
 -    /* PHASE 0 */
 -
 -    list_ent = d->page_list.next;
 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 -    {
 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 -        page = &frame_table[pfn];
 -
 -        BUG_ON(page_get_owner(page) != d);
 -
 -        if ( (page->u.inuse.type_info & PGT_count_mask) >
 -             (page->count_info & PGC_count_mask) )
 -            printk("taf > caf %x %x pfn=%lx\n",
 -                   page->u.inuse.type_info, page->count_info, pfn );
 - 
 -#if 0   /* SYSV shared memory pages plus writeable files. */
 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 -        {
 -            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
 -                  pfn,
 -                  page->u.inuse.type_info,
 -                  page->count_info );
 -            scan_for_pfn_remote(pfn);
 -        }
 -#endif
 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 -        {
 -            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
 -                  pfn,
 -                  page->u.inuse.type_info,
 -                  page->count_info );
 -        }
 -
 -        /* Use tlbflush_timestamp to store original type_info. */
 -        page->tlbflush_timestamp = page->u.inuse.type_info;
 -
 -        list_ent = frame_table[pfn].list.next;
 -    }
 -
 -
 -    /* PHASE 1 */
 -    if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
 -        adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.guest_table)
 -                           >>PAGE_SHIFT], -1, 1);
 -
 -    list_ent = d->page_list.next;
 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 -    {
 -        unsigned long *pt;
 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 -        page = &frame_table[pfn];
 -
 -        BUG_ON(page_get_owner(page) != d);
 -
 -        switch ( page->u.inuse.type_info & PGT_type_mask )
 -        {
 -        case PGT_l2_page_table:
 -
 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 -                printk("Audit %d: L2 not validated %x\n",
 -                       d->id, page->u.inuse.type_info);
 -
 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 -                printk("Audit %d: L2 not pinned %x\n",
 -                       d->id, page->u.inuse.type_info);
 -            else
 -                adjust( page, -1, 1 );
 -           
 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 -
 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 -            {
 -                if ( pt[i] & _PAGE_PRESENT )
 -                {
 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 -                    struct pfn_info *l1page = &frame_table[l1pfn];
 -
 -                    if ( page_get_owner(l1page) != d )
 -                    {
 -                        printk("L2: Skip bizarre page belonging to other "
 -                               "dom %p\n", page_get_owner(l1page));
 -                        continue;
 -                    }
 -                    
 -                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 -                         PGT_l2_page_table )
 -                        printk("Audit %d: [%x] Found %s Linear PT "
 -                               "t=%x pfn=%lx\n", d->id, i, 
 -                               (l1pfn==pfn) ? "Self" : "Other",
 -                               l1page->u.inuse.type_info,
 -                               l1pfn);
 -                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
 -                              PGT_l1_page_table )
 -                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
 -                               d->id, i,
 -                               l1page->u.inuse.type_info,
 -                               l1pfn);
 -
 -                    adjust(l1page, -1, 1);
 -                }
 -            }
 -
 -            unmap_domain_mem(pt);
 -
 -            break;
 -
 -
 -        case PGT_l1_page_table:
 -            
 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 -                adjust( page, -1, 1 );
 -
 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 -                printk("Audit %d: L1 not validated %x\n",
 -                       d->id, page->u.inuse.type_info);
 -#if 0
 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 -                printk("Audit %d: L1 not pinned %x\n",
 -                       d->id, page->u.inuse.type_info);
 -#endif
 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 -
 -            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
 -            {
 -                if ( pt[i] & _PAGE_PRESENT )
 -                {
 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 -                    struct pfn_info *l1page = &frame_table[l1pfn];
 -
 -                    if ( l1pfn < 0x100 )
 -                    {
 -                        lowmem_mappings++;
 -                        continue;
 -                    }
 -
 -                    if ( l1pfn > max_page )
 -                    {
 -                        io_mappings++;
 -                        continue;
 -                    }
 -
 -                    if ( pt[i] & _PAGE_RW )
 -                    {
 -
 -                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 -                             PGT_l1_page_table ||
 -                             (l1page->u.inuse.type_info & PGT_type_mask) ==
 -                             PGT_l2_page_table )
 -                            printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
 -                                   d->id, i,
 -                                   l1page->u.inuse.type_info,
 -                                   l1pfn);
 -
 -                    }
 -
 -                    if ( page_get_owner(l1page) != d )
 -                    {
 -                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
 -                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
 -                               d->id, pfn, i,
 -                               page_get_owner(l1page),
 -                               l1pfn,
 -                               l1page->count_info,
 -                               l1page->u.inuse.type_info,
 -                               machine_to_phys_mapping[l1pfn]);    
 -                        continue;
 -                    }
 -
 -                    adjust(l1page, -1, 0);
 -                }
 -            }
 -
 -            unmap_domain_mem(pt);
 -
 -            break;
 -        }       
 -
 -        list_ent = frame_table[pfn].list.next;
 -    }
 -
 -    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
 -        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
 -               d->id, lowmem_mappings, io_mappings);
 -
 -    /* PHASE 2 */
 -
 -    ctot = ttot = 0;
 -    list_ent = d->page_list.next;
 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 -    {
 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 -        page = &frame_table[pfn];
 -
 -        switch ( page->u.inuse.type_info & PGT_type_mask)
 -        {
 -        case PGT_l1_page_table:
 -        case PGT_l2_page_table:
 -            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
 -            {
 -                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
 -                       d->id, page->u.inuse.type_info, 
 -                       page->tlbflush_timestamp,
 -                       page->count_info, pfn );
 -                scan_for_pfn_remote(pfn);
 -            }
 -        default:
 -            if ( (page->count_info & PGC_count_mask) != 1 )
 -            {
 -                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
 -                       d->id, 
 -                       page->count_info,
 -                       page->u.inuse.type_info, 
 -                       page->tlbflush_timestamp, pfn );
 -                scan_for_pfn_remote(pfn);
 -            }
 -            break;
 -        }
 -
 -        list_ent = frame_table[pfn].list.next;
 -    }
 -
 -    /* PHASE 3 */
 -    list_ent = d->page_list.next;
 -    l1 = l2 = 0;
 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 -    {
 -        unsigned long *pt;
 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 -        page = &frame_table[pfn];
 -
 -        switch ( page->u.inuse.type_info & PGT_type_mask )
 -        {
 -        case PGT_l2_page_table:
 -          l2++;
 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 -                adjust( page, 1, 1 );          
 -
 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 -
 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 -            {
 -                if ( pt[i] & _PAGE_PRESENT )
 -                {
 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 -                    struct pfn_info *l1page;
 -
 -                    if (l1pfn>max_page)
 -                        continue;
 -
 -                    l1page = &frame_table[l1pfn];
 -
 -                    if ( page_get_owner(l1page) == d )
 -                        adjust(l1page, 1, 1);
 -                }
 -            }
 -
 -            unmap_domain_mem(pt);
 -            break;
 -
 -        case PGT_l1_page_table:
 -          l1++;
 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 -                adjust( page, 1, 1 );
 -
 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 -
 -            for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
 -            {
 -                if ( pt[i] & _PAGE_PRESENT )
 -                {
 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 -                    struct pfn_info *l1page;
 -
 -                    if (l1pfn>max_page)
 -                        continue;
 -
 -                    l1page = &frame_table[l1pfn];
 -
 -                    if ( (page_get_owner(l1page) != d) ||
 -                         (l1pfn < 0x100) || (l1pfn > max_page) )
 -                        continue;
 -
 -                    adjust(l1page, 1, 0);
 -                }
 -            }
 -
 -            unmap_domain_mem(pt);
 -            break;
 -        }
 -
 -
 -        page->tlbflush_timestamp = 0;
 -
 -        list_ent = frame_table[pfn].list.next;
 -    }
 -
 -
 -    if ( pagetable_val(d->exec_domain[0]->arch.guest_table) )
 -        adjust(&frame_table[pagetable_val(
 -            d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT], 1, 1);
 -
 -    spin_unlock(&d->page_alloc_lock);
 -    printk("Audit %d: Done. ref=%d xenpages=%d pages=%d l1=%d"
 -           " l2=%d ctot=%d ttot=%d\n", 
 -           d->id, atomic_read(&d->refcnt), d->xenheap_pages, d->tot_pages,
 -           l1, l2, ctot, ttot );
 -
 -    if ( d != current->domain )
 -        domain_unpause(d);
 -}
 -
 -void audit_domains(void)
 -{
 -    struct domain *d;
 -    for_each_domain ( d )
 -        audit_domain(d);
 -}
 -
 -void audit_domains_key(unsigned char key)
 -{
 -    audit_domains();
 -}
 -
 -#endif /* NDEBUG */
 -
+ /* Graveyard: stuff below may be useful in future. */
+ #if 0
+     case MMUEXT_TRANSFER_PAGE:
+         domid  = (domid_t)(val >> 16);
+         gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
+         
+         if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
+              unlikely(!pfn_is_ram(pfn)) ||
+              unlikely((e = find_domain_by_id(domid)) == NULL) )
+         {
+             MEM_LOG("Bad frame (%p) or bad domid (%d).\n", pfn, domid);
+             okay = 0;
+             break;
+         }
+         spin_lock(&d->page_alloc_lock);
  
- void ptwr_status(void)
- {
-     unsigned long pte, *ptep, pfn;
-     struct pfn_info *page;
-     int cpu = smp_processor_id();
+         /*
+          * The tricky bit: atomically release ownership while there is just one
+          * benign reference to the page (PGC_allocated). If that reference
+          * disappears then the deallocation routine will safely spin.
+          */
+         _d  = pickle_domptr(d);
+         _nd = page->u.inuse._domain;
+         y   = page->count_info;
+         do {
+             x = y;
+             if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
+                           (1|PGC_allocated)) ||
+                  unlikely(_nd != _d) )
+             {
+                 MEM_LOG("Bad page values %p: ed=%p(%u), sd=%p,"
+                         " caf=%08x, taf=%08x\n", page_to_pfn(page),
+                         d, d->id, unpickle_domptr(_nd), x, 
+                         page->u.inuse.type_info);
+                 spin_unlock(&d->page_alloc_lock);
+                 put_domain(e);
+                 return 0;
+             }
+             __asm__ __volatile__(
+                 LOCK_PREFIX "cmpxchg8b %2"
+                 : "=d" (_nd), "=a" (y),
+                 "=m" (*(volatile u64 *)(&page->count_info))
+                 : "0" (_d), "1" (x), "c" (NULL), "b" (x) );
+         } 
+         while ( unlikely(_nd != _d) || unlikely(y != x) );
  
-     ptep = (unsigned long *)&linear_pg_table
-         [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
+         /*
+          * Unlink from 'd'. At least one reference remains (now anonymous), so
+          * no one else is spinning to try to delete this page from 'd'.
+          */
+         d->tot_pages--;
+         list_del(&page->list);
+         
+         spin_unlock(&d->page_alloc_lock);
  
-     if ( __get_user(pte, ptep) ) {
-         MEM_LOG("ptwr: Could not read pte at %p", ptep);
-         domain_crash();
-     }
+         spin_lock(&e->page_alloc_lock);
  
-     pfn = pte >> PAGE_SHIFT;
-     page = &frame_table[pfn];
-     printk("need to alloc l1 page %p\n", page);
-     /* make pt page writable */
-     printk("need to make read-only l1-page at %p is %p\n",
-            ptep, pte);
+         /*
+          * Check that 'e' will accept the page and has reservation headroom.
+          * Also, a domain mustn't have PGC_allocated pages when it is dying.
+          */
+         ASSERT(e->tot_pages <= e->max_pages);
+         if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
+              unlikely(e->tot_pages == e->max_pages) ||
+              unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
+         {
+             MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
+                     "provided a bad grant ref, or is dying (%p).\n",
+                     e->tot_pages, e->max_pages, e->d_flags);
+             spin_unlock(&e->page_alloc_lock);
+             put_domain(e);
+             okay = 0;
+             break;
+         }
  
-     if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
-         return;
+         /* Okay, add the page to 'e'. */
+         if ( unlikely(e->tot_pages++ == 0) )
+             get_knownalive_domain(e);
+         list_add_tail(&page->list, &e->page_list);
+         page_set_owner(page, e);
  
-     if ( __get_user(pte, (unsigned long *)
-                     ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
-         MEM_LOG("ptwr: Could not read pte at %p", (unsigned long *)
-                 ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
-         domain_crash();
-     }
-     pfn = pte >> PAGE_SHIFT;
-     page = &frame_table[pfn];
- }
+         spin_unlock(&e->page_alloc_lock);
  
- #endif /* NDEBUG */
+         /* Transfer is all done: tell the guest about its new page frame. */
+         gnttab_notify_transfer(e, gntref, pfn);
+         
+         put_domain(e);
+         break;
+ #endif
  
  /*
   * Local variables:
index 910664d7824096b61a911c6510da263c02c731b3,e47eccf8eeb521c94cb5704256651247ae6b2474..9db6c7616d4800d83860ccf07c54d38309f4d5dd
@@@ -2263,51 -613,54 +2262,49 @@@ void __shadow_sync_all(struct domain *d
  
  int shadow_fault(unsigned long va, struct xen_regs *regs)
  {
 -    unsigned long gpte, spte = 0;
 +    unsigned long gpte, spte = 0, orig_gpte;
      struct exec_domain *ed = current;
      struct domain *d = ed->domain;
 +    unsigned long gpde;
  
      SH_VVLOG("shadow_fault( va=%p, code=%lu )", va, regs->error_code );
 -
 -    check_pagetable(d, ed->arch.guest_table, "pre-sf");
 +    perfc_incrc(shadow_fault_calls);
 +    
 +    check_pagetable(ed, "pre-sf");
  
      /*
 -     * STEP 1. A fast-reject set of checks with no locking.
 +     * Don't let someone else take the guest's table pages out-of-sync.
       */
 +    shadow_lock(d);
  
 -    if ( unlikely(__get_user(gpte, (unsigned long *)
 -                             &linear_pg_table[va >> PAGE_SHIFT])) )
 -    {
 -        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
 -        return 0;
 -    }
 -
 -    if ( !(gpte & _PAGE_PRESENT) )
 -    {
 -        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
 -        return 0;
 -    }
 -
 -    if ( (regs->error_code & 2)  && !(gpte & _PAGE_RW) )
 -    {
 -        /* Write fault on a read-only mapping. */
 -        return 0;
 -    }
 +    /* XXX - FIX THIS COMMENT!!!
 +     * STEP 1. Check to see if this fault might have been caused by an
 +     *         out-of-sync table page entry, or if we should pass this
 +     *         fault onto the guest.
 +     */
 +    __shadow_sync_va(ed, va);
  
      /*
 -     * STEP 2. Take the shadow lock and re-check the guest PTE.
 +     * STEP 2. Check the guest PTE.
       */
 -
 -    shadow_lock(d);
 - 
 -    if ( unlikely(__get_user(gpte, (unsigned long *)
 -                             &linear_pg_table[va >> PAGE_SHIFT])) )
 +    __guest_get_l2e(ed, va, &gpde);
 +    if ( unlikely(!(gpde & _PAGE_PRESENT)) )
      {
 -        SH_VVLOG("shadow_fault - EXIT: read gpte faulted2" );
 +        SH_VVLOG("shadow_fault - EXIT: L1 not present" );
 +        perfc_incrc(shadow_fault_bail_pde_not_present);
-         shadow_unlock(d);
-         return 0;
+         goto fail;
      }
  
 +    // This can't fault because we hold the shadow lock and we've ensured that
 +    // the mapping is in-sync, so the check of the PDE's present bit, above,
 +    // covers this access.
 +    //
 +    orig_gpte = gpte = l1_pgentry_val(linear_pg_table[l1_linear_offset(va)]);
      if ( unlikely(!(gpte & _PAGE_PRESENT)) )
      {
 -        SH_VVLOG("shadow_fault - EXIT: gpte not present2 (%lx)",gpte );
 +        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
 +        perfc_incrc(shadow_fault_bail_pte_not_present);
-         shadow_unlock(d);
-         return 0;
+         goto fail;
      }
  
      /* Write fault? */
          {
              /* Write fault on a read-only mapping. */
              SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%lx)", gpte);
-             shadow_unlock(d);
-             return 0;
 +            perfc_incrc(shadow_fault_bail_ro_mapping);
+             goto fail;
          }
  
 -        l1pte_write_fault(d, &gpte, &spte);
 +        if ( !l1pte_write_fault(ed, &gpte, &spte, va) )
 +        {
 +            SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed");
 +            perfc_incrc(write_fault_bail);
 +            shadow_unlock(d);
 +            return 0;
 +        }
      }
      else
      {
  
      /* XXX Watch out for read-only L2 entries! (not used in Linux). */
      if ( unlikely(__put_user(gpte, (unsigned long *)
 -                             &linear_pg_table[va >> PAGE_SHIFT])) )
 +                             &linear_pg_table[l1_linear_offset(va)])) )
      {
-         printk("shadow_fault(): crashing domain %d "
 -        domain_crash();
 -        goto fail;
++        printk("shadow_fault() failed, crashing domain %d "
 +               "due to a read-only L2 page table (gpde=%p), va=%p\n",
 +               d->id, gpde, va);
-         domain_crash();
++        domain_crash_synchronous();
      }
  
 -    /*
 -     * Update of shadow PTE can fail because the L1 p.t. is not shadowed,
 -     * or because the shadow isn't linked into this shadow L2 p.t.
 -     */
 -    if ( unlikely(__put_user(spte, (unsigned long *)
 -                             &shadow_linear_pg_table[va >> PAGE_SHIFT])) )
 -    {
 -        SH_VVLOG("3: not shadowed/mapped gpte=%p spte=%p", gpte, spte);
 -        shadow_map_l1_into_current_l2(va);
 -        shadow_linear_pg_table[va >> PAGE_SHIFT] = mk_l1_pgentry(spte);
 -    }
 +    // if necessary, record the page table page as dirty
 +    if ( unlikely(shadow_mode_log_dirty(d)) && (orig_gpte != gpte) )
 +        mark_dirty(d, __gpfn_to_mfn(d, gpde >> PAGE_SHIFT));
  
 -    perfc_incrc(shadow_fixup_count);
 +    shadow_set_l1e(va, spte, 1);
 +
 +    perfc_incrc(shadow_fault_fixed);
      d->arch.shadow_fault_count++;
  
      shadow_unlock(d);
  
 -    check_pagetable(d, ed->arch.guest_table, "post-sf");
 +    check_pagetable(ed, "post-sf");
      return EXCRET_fault_fixed;
+  fail:
+     shadow_unlock(d);
+     return 0;
  }
  
 -
 -void shadow_l1_normal_pt_update(
 -    unsigned long pa, unsigned long gpte,
 -    unsigned long *prev_smfn_ptr,
 -    l1_pgentry_t **prev_spl1e_ptr)
 +/*
 + * What lives where in the 32-bit address space in the various shadow modes,
 + * and what it uses to get/maintain that mapping.
 + *
 + * SHADOW MODE:      none         enable         translate         external
 + * 
 + * 4KB things:
 + * guest_vtable    lin_l2     mapped per gpdt  lin_l2 via hl2   mapped per gpdt
 + * shadow_vtable     n/a         sh_lin_l2       sh_lin_l2      mapped per gpdt
 + * hl2_vtable        n/a            n/a        lin_hl2 via hl2  mapped per gpdt
 + * monitor_vtable    n/a            n/a             n/a           mapped once
 + *
 + * 4MB things:
 + * guest_linear  lin via gpdt   lin via gpdt     lin via hl2      lin via hl2
 + * shadow_linear     n/a      sh_lin via spdt  sh_lin via spdt  sh_lin via spdt
 + * monitor_linear    n/a            n/a             n/a              ???
 + * perdomain      perdomain      perdomain       perdomain        perdomain
 + * R/O M2P         R/O M2P        R/O M2P           n/a              n/a
 + * R/W M2P         R/W M2P        R/W M2P         R/W M2P          R/W M2P
 + * P2M               n/a            n/a           R/O M2P          R/O M2P
 + *
 + * NB:
 + * update_pagetables(), __update_pagetables(), shadow_mode_enable(),
 + * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable()
 + * all play a part in maintaining these mappings.
 + */
 +void __update_pagetables(struct exec_domain *ed)
  {
 -    unsigned long smfn, spte, prev_smfn = *prev_smfn_ptr;    
 -    l1_pgentry_t *spl1e, *prev_spl1e = *prev_spl1e_ptr;
 +    struct domain *d = ed->domain;
 +    unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
 +    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
 +    unsigned long smfn, hl2mfn, old_smfn;
  
 -    /* N.B. To get here, we know the l1 page *must* be shadowed. */
 -    SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%p, "
 -             "prev_smfn=%p, prev_spl1e=%p",
 -             pa, gpte, prev_smfn, prev_spl1e);
 +    int max_mode = ( shadow_mode_external(d) ? SHM_external
 +                     : shadow_mode_translate(d) ? SHM_translate
 +                     : shadow_mode_enabled(d) ? SHM_enable
 +                     : 0 );
  
 -    smfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
 +    ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) );
 +    ASSERT( max_mode );
  
 -    if ( smfn == prev_smfn )
 -    {
 -        spl1e = prev_spl1e;
 -    }
 -    else
 +    /*
 +     *  arch.guest_vtable
 +     */
 +    if ( max_mode & (SHM_enable | SHM_external) )
      {
 -        if ( prev_spl1e != NULL )
 -            unmap_domain_mem( prev_spl1e );
 -        spl1e = (l1_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
 -        *prev_smfn_ptr  = smfn;
 -        *prev_spl1e_ptr = spl1e;
 +        if ( likely(ed->arch.guest_vtable != NULL) )
 +            unmap_domain_mem(ed->arch.guest_vtable);
 +        ed->arch.guest_vtable = map_domain_mem(gmfn << PAGE_SHIFT);
      }
  
 -    l1pte_propagate_from_guest(current->domain, &gpte, &spte);
 -    spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = mk_l1_pgentry(spte);
 -}
 -
 -void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde)
 -{
 -    unsigned long sl2mfn, spde = 0;
 -    l2_pgentry_t *spl2e;
 -    unsigned long sl1mfn;
 -
 -    /* N.B. To get here, we know the l2 page *must* be shadowed. */
 -    SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%p",pa,gpde);
 +    /*
 +     *  arch.shadow_table
 +     */
 +    if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) )
 +        smfn = shadow_l2_table(d, gpfn, gmfn);
 +    if ( !get_shadow_ref(smfn) )
 +        BUG();
 +    old_smfn = pagetable_val(ed->arch.shadow_table) >> PAGE_SHIFT;
 +    ed->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT);
 +    if ( old_smfn )
 +        put_shadow_ref(old_smfn);
  
 -    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
 +    SH_VVLOG("__update_pagetables(gmfn=%p, smfn=%p)", gmfn, smfn);
  
      /*
 -     * Only propagate to shadow if _PAGE_ACCESSED is set in the guest.
 -     * Otherwise, to ensure coherency, we blow away the existing shadow value.
 +     * arch.shadow_vtable
       */
 -    if ( gpde & _PAGE_ACCESSED )
 +    if ( max_mode == SHM_external )
      {
 -        sl1mfn = (gpde & _PAGE_PRESENT) ?
 -            __shadow_status(current->domain, gpde >> PAGE_SHIFT) : 0;
 -        l2pde_general(current->domain, &gpde, &spde, sl1mfn);
 +        if ( ed->arch.shadow_vtable )
 +            unmap_domain_mem(ed->arch.shadow_vtable);
 +        ed->arch.shadow_vtable = map_domain_mem(smfn << PAGE_SHIFT);
      }
  
 -    spl2e = (l2_pgentry_t *)map_domain_mem(sl2mfn << PAGE_SHIFT);
 -    spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spde);
 -    unmap_domain_mem(spl2e);
 -}
 +    /*
 +     * arch.hl2_vtable
 +     */
  
 -unsigned long mk_hl2_table(struct exec_domain *ed)
 -{
 -    struct domain *d = ed->domain;
 -    unsigned long gmfn = pagetable_val(ed->arch.guest_table) >> PAGE_SHIFT;
 -    unsigned long gpfn = __mfn_to_gpfn(d, gmfn);
 -    unsigned long hl2mfn, status;
 -    struct pfn_info *hl2_info;
 -    l1_pgentry_t *hl2;
 +    // if max_mode == SHM_translate, then the hl2 is already installed
 +    // correctly in its smfn, and there's nothing to do.
 +    //
 +    if ( max_mode == SHM_external )
 +    {
 +        if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) )
 +            hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn);
 +        if ( !get_shadow_ref(hl2mfn) )
 +            BUG();
  
 -    perfc_incr(hl2_table_pages);
 +        if ( ed->arch.hl2_vtable )
 +            unmap_domain_mem(ed->arch.hl2_vtable);
 +        ed->arch.hl2_vtable = map_domain_mem(hl2mfn << PAGE_SHIFT);
 +    }
  
 -    if ( (hl2_info = alloc_shadow_page(d)) == NULL )
 -        BUG(); /* XXX Deal gracefully with failure. */
 +    /*
 +     * fixup pointers in monitor table, as necessary
 +     */
 +    if ( max_mode == SHM_external )
 +    {
 +        l2_pgentry_t *mpl2e = ed->arch.monitor_vtable;
  
 -    hl2_info->u.inuse.type_info = PGT_l1_page_table;
 +        ASSERT( shadow_mode_translate(d) );
  
 -    hl2mfn = page_to_pfn(hl2_info);
 -    status = hl2mfn | PSH_hl2;
 -    set_shadow_status(ed->domain, gpfn | PSH_hl2, status);
 +        BUG(); // ref counts for hl2mfn and smfn need to be maintained!
  
 -    // need to optimize this...
 -    hl2 = map_domain_mem(hl2mfn << PAGE_SHIFT);
 -    memset(hl2, 0, PAGE_SIZE);
 -    unmap_domain_mem(hl2);
 +        mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
 +            mk_l2_pgentry((hl2mfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  
 -    return status;
 -}
 +        mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
 +            mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  
 +        // XXX - maybe this can be optimized somewhat??
 +        local_flush_tlb();
 +    }
 +}
  
  
  /************************************************************************/
Simple merge
Simple merge
Simple merge
Simple merge
index 798516a5a2d5279e82fa2436d4e2e2d6be3e8d57,0fa705dff0e50c6d0124f3379d6a29a462c43093..d6794422f6646a947889c588acb2ae5c390153bb
@@@ -558,30 -558,11 +559,27 @@@ void free_domheap_pages(struct pfn_inf
          /* NB. May recursively lock from domain_relinquish_memory(). */
          spin_lock_recursive(&d->page_alloc_lock);
  
-         for_each_exec_domain ( d, ed )
-             cpu_mask |= 1 << ed->processor;
          for ( i = 0; i < (1 << order); i++ )
          {
 +            if ( ((pg[i].u.inuse.type_info & PGT_count_mask) != 0) &&
 +                shadow_mode_enabled(d) )
 +            {
 +                // XXX This needs more thought...
 +                //
 +                printk("%s: needing to call shadow_remove_all_access for mfn=%p\n",
 +                       __func__, page_to_pfn(&pg[i]));
 +                printk("Amfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]),
 +                       pg[i].count_info, pg[i].u.inuse.type_info);
 +                shadow_lock(d);
 +                shadow_remove_all_access(d, page_to_pfn(&pg[i]));
 +                shadow_unlock(d);
 +                printk("Bmfn=%p c=%p t=%p\n", page_to_pfn(&pg[i]),
 +                       pg[i].count_info, pg[i].u.inuse.type_info);
 +            }
 +
              ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0);
              pg[i].tlbflush_timestamp  = tlbflush_current_time();
-             pg[i].u.free.cpu_mask     = cpu_mask;
+             pg[i].u.free.cpu_mask     = d->cpuset;
              list_del(&pg[i].list);
          }
  
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge